What is the impact of each eigenvector on the normalized spectral clustering algorithm? As we increase $k$, the number of clusters, how does the clustering change? And what kinds of datasets can spectral clustering handle well?
from lib.spectral_clustering import spectral_clustering, laplacian_matrix, similarity_matrix
from lib.datasets import gaussian_mixture
from lib.kmeans import kmeans
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
from matplotlib import cm
from tqdm import tqdm
n_gaussians = 5
n_pts = 4
n = n_pts * n_gaussians
d = 2
data = gaussian_mixture(n_gaussians, n_pts, d, centroid_var=10)
plt.scatter(*data.T)
plt.xlabel("$x_1$")
plt.ylabel("$x_2$")
plt.title("Sample of Gaussian Mixture")
plt.show()
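Before looking at the output, here is a rough sketch of what the "rw" (random-walk) normalization used in the next cell presumably does: build a similarity matrix $W$, form $L_{rw} = D^{-1}(D - W) = I - D^{-1}W$ with $D$ the diagonal degree matrix, and keep the eigenvectors belonging to the smallest eigenvalues; k-means on the rows of those eigenvectors then gives the cluster assignments. The helper below only illustrates this standard construction (von Luxburg, 2007); it is not the actual lib.spectral_clustering code, and the dense eigensolver is an assumption made for brevity.
# Illustration only: random-walk normalized Laplacian and its eigenvectors,
# given a precomputed similarity matrix W (not the lib.spectral_clustering implementation).
def rw_laplacian_eigenvectors(W):
    d = W.sum(axis=1)                       # node degrees
    L_rw = np.eye(len(W)) - W / d[:, None]  # I - D^{-1} W  (divide row i of W by d_i)
    evals, evecs = np.linalg.eig(L_rw)      # L_rw is non-symmetric in general, so eig rather than eigh
    order = np.argsort(evals.real)          # smallest eigenvalues first
    return evals.real[order], evecs.real[:, order]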
_, (evals, evecs) = spectral_clustering(data, k=n, lform="rw", with_eigen=True, metric="e")
plt.imshow(evecs)
plt.axis('off')
plt.title("Eigenvectors of Graph Laplacian")
plt.show()
The following plot shows how the eigenvalues grow with their index.
plt.plot(evals, "bo-")
plt.xlabel("Index")
plt.ylabel(r"$\lambda_i$")
plt.xticks(range(n))
plt.title("Eigenvalues of Similarity Graph Laplacian")
plt.show()
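A standard way to read this plot is the eigengap heuristic: if the data consists of $k$ well-separated groups, the first $k$ eigenvalues stay close to zero and then jump, so $k$ can be estimated as the position of the largest gap between consecutive eigenvalues. A quick check, assuming evals is real-valued as the plot above suggests:
gaps = np.diff(np.sort(np.real(evals)))  # consecutive gaps, smallest eigenvalue first
k_est = int(np.argmax(gaps)) + 1         # number of eigenvalues before the largest jump
print(f"eigengap estimate of the number of clusters: {k_est}")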
Each subplot below shows the clustering obtained by running k-means on a certain number of eigenvectors of the normalized graph Laplacian. The subplots are cumulative: plot $i+1$ uses every eigenvector used for plot $i$, plus one more.
cmap = plt.get_cmap("tab20")  # plt.get_cmap avoids cm.get_cmap, which is removed in newer matplotlib
# !! if n > 20, several indices fall into the same color bin, which can make a correct clustering look wrong
unif_colors = [cmap(intensity) for intensity in np.linspace(0, 1, n)]
r = n_pts
c = n_gaussians
# r * c = n
for i in tqdm(range(1, n + 1)):
    #evecs = np.fliplr(evecs)
    _, assns = kmeans(evecs[:, :i], i, iters=100)
    data_clusters = [ data[assns == clss].T for clss in range(i) ]
    plt.subplot(r, c, i)
    plt.title(f"{i} evecs")
    plt.gca().set_xticks([], [])
    plt.gca().set_yticks([], [])
    for j, data_cluster in enumerate(data_clusters):
        plt.scatter(*data_cluster, color=unif_colors[j])
plt.gcf().set_size_inches(10, 10)
plt.tight_layout()
plt.savefig("experiments/cumulative_eigenvectors/Cumulative_Evecs_Descend.png")
plt.show()
Each subplot shows a partition of the points using only the signs of the entries of a particular eigenvector.
Hypothesis: the $n$-th eigenvector is an approximate indicator of the $n$-th most refined cluster. This is motivated by the ideal case: if the similarity graph splits into $k$ disconnected components, the eigenspace of eigenvalue $0$ of the random-walk Laplacian is spanned by the indicator vectors of those components, so when the between-cluster connections are weak we expect perturbed versions of those indicators.
r = n_pts
c = n_gaussians
# r * c = n
for i in tqdm(range(0, n)):
    assns = (evecs[:, i] > 0)
    data_clusters = [ data[assns == clss, :].T for clss in [0, 1] ]  # two classes: sign of the eigenvector
    plt.subplot(r, c, i + 1)
    plt.title(f"evec {i+1}")
    plt.gca().set_xticks([], [])
    plt.gca().set_yticks([], [])
    for j, data_cluster in enumerate(data_clusters):
        plt.scatter(*data_cluster)
plt.gcf().set_size_inches(10, 10)
plt.tight_layout()
plt.savefig("experiments/cumulative_eigenvectors/Individual_Eigenvectors.png")
plt.show()
How well can $k$-means and spectral clustering recover the two interlocking clusters in the moons dataset?
from lib.kmeans import kmeans
from lib.spectral_clustering import spectral_clustering
from sklearn.datasets import make_moons
data, _ = make_moons(200, noise=0.05)
plt.scatter(*data.T)
plt.xlabel("$x_1$")
plt.ylabel("$x_2$")
plt.title("Sample of Moons Data")
plt.show()
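In the cells below, metric="g" with bandwidth s presumably selects a Gaussian (RBF) similarity $w_{ij} = \exp(-\lVert x_i - x_j\rVert^2 / (2\sigma^2))$: with a small $\sigma$ only very close points are strongly connected, which is what lets spectral clustering trace the thin moon shapes while plain $k$-means cannot. The snippet below sketches that similarity under this assumption; it is not the actual lib.spectral_clustering code.
# Sketch of the Gaussian (RBF) similarity assumed behind metric="g":
# w_ij = exp(-||x_i - x_j||^2 / (2 sigma^2)); small sigma keeps only local connections.
def gaussian_similarity(X, sigma):
    sq_dists = ((X[:, None, :] - X[None, :, :]) ** 2).sum(-1)
    return np.exp(-sq_dists / (2 * sigma ** 2))

plt.imshow(gaussian_similarity(data, sigma=0.1))
plt.title(r"Gaussian similarity of the moons sample, $\sigma=0.1$")
plt.show()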
plt.subplot(1, 3, 1)
assns = spectral_clustering(data, k=2, lform="rw", metric="g", s=0.1)
data_clusters = [ data[assns == clss].T for clss in range(2) ]
for j, data_cluster in enumerate(data_clusters):
    plt.scatter(*data_cluster, color=unif_colors[j], label=f"{j}")
plt.title(r"Gaussian Kernel, $\sigma=0.1$")
plt.subplot(1, 3, 2)
assns = spectral_clustering(data, k=2, lform="rw", metric="g", s=1)
data_clusters = [ data[assns == clss].T for clss in range(2) ]
for j, data_cluster in enumerate(data_clusters):
    plt.scatter(*data_cluster, color=unif_colors[j], label=f"{j}")
plt.title(r"Gaussian Kernel, $\sigma=1$")
plt.subplot(1, 3, 3)
_, assns = kmeans(data, k=2)
data_clusters = [ data[assns == clss].T for clss in range(2) ]
for j, data_cluster in enumerate(data_clusters):
    plt.scatter(*data_cluster, color=unif_colors[j], label=f"{j}")
plt.title("K-means++")
plt.suptitle("Spectral Clustering with Gaussian Kernel vs. K-means")
plt.gcf().set_size_inches(15, 5)
plt.savefig("Moons_Comparison.png", dpi=120)
plt.show()